library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(here)
## here() starts at /Users/marierivers/Documents/UCSB_Environmental_Data_Science/EDS_222_Statistics_for_Environmental_Data_Science/EDS_222_Final_Project
library(skimr)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(stringr)
library(ggplot2)
library(patchwork)
library(gghighlight)
library(paletteer)
library(ggExtra)
library(ggrepel)
library(ggbeeswarm)
library(sf)
## Linking to GEOS 3.8.1, GDAL 3.2.1, PROJ 7.2.1
library(urbnmapr)
library(tidycensus)
library(jtools)
library(ggstance)
##
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
##
## geom_errorbarh, GeomErrorbarh
library(huxtable)
##
## Attaching package: 'huxtable'
## The following object is masked from 'package:kableExtra':
##
## add_footnote
## The following object is masked from 'package:GGally':
##
## wrap
## The following object is masked from 'package:dplyr':
##
## add_rownames
## The following object is masked from 'package:ggplot2':
##
## theme_grey
#library(tigris)
api_txt <- "~/Documents/UCSB_Environmental_Data_Science/EDS_222_Statistics_for_Environmental_Data_Science/Final_Project_Notes/census_key.txt"
api_key <- readLines(api_txt)
census_api_key(api_key)
## To install your API key for use in future sessions, run this function with `install = TRUE`.
fips_codes <- data.frame(fips_codes) %>%
rename(county_name = county) %>%
mutate(stfips = paste0(state_code, county_code))
mutate(pop_change_text = case_when( pop_change_pct > 0 ~ “positive”, pop_change_pct <= 0 ~ “negative”)) # Read in county level population data
header <- c("county_name", "april_1_2000", "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "april_1_2010", "july_1_2010")
codes <- c("01", "02", "04", "05", "06", "08", "09", "10", "11", "12", "13", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", "42", "44", "45", "46", "47", "48", "49", "50", "51", "53", "54", "55", "56")
source(here("src", "state_pop_fun.R"))
county_pop = data.frame()
for (i in seq_along(codes)) {
state <- state_pop_fun(fips = codes[i])
state_df <- data.frame(state)
county_pop <- rbind(county_pop, state_df)
}
county_pop <- county_pop %>%
left_join(fips_codes, by = c("county_name", "state_code")) %>%
mutate(pop_change_pct = ((july_1_2010 - X2006) / X2006) * 100) %>%
mutate(pop_change_text = case_when(
pop_change_pct > 0 ~ "positive",
pop_change_pct <= 0 ~ "negative"))
county_pop_histogram <- ggplot(data = county_pop, aes(x = pop_change_pct)) +
geom_histogram(fill = "red", bins = 200)
county_pop_histogram
eqi_pop <- left_join(eqi, county_pop, by = c("stfips")) %>%
#relocate(c(stfips, state_code, county_code, state, state_name), .before = county_name) %>%
select(-april_1_2000, -X2000, -X2001, -X2002, -X2003, -X2004, -X2005, -X2007, -X2008, -X2009, -july_1_2010, -april_1_2010, -county_name.y, -county_name, -state_code.y, -county_code.y, -state.y, -state_name.y) %>%
filter(!X2006 == "null") %>%
rename(EQI = EQI_22July2013) %>%
rename(air_EQI = air_EQI_22July2013) %>%
rename(water_EQI = water_EQI_22July2013) %>%
rename(land_EQI = land_EQI_22July2013) %>%
rename(built_EQI = built_EQI_22July2013) %>%
rename(sociodemographic_EQI = sociod_EQI_22July2013) %>%
rename(state_code = state_code.x) %>%
rename(state_name = state_name.x) %>%
rename(state = state.x) %>%
rename(county_name = county_name.x) %>%
rename(county_code = county_code.x) %>%
select(-X2006)
eqi_pop_no_outlier <- eqi_pop %>%
filter(pop_change_pct < 100)
# xxx
write_csv(eqi_pop, here("data", "eqi_pop.csv"))
eqi_pop_change_plot_scatter <- ggplot(data = eqi_pop, aes(x = EQI, y = pop_change_pct)) +
geom_point(aes(color = pop_change_text)) +
theme(legend.position = "bottom") +
labs(title = "County percent population change vs. EQI, 2006-2010", x = "EQI", y = " % population change", color = "Population change:")
eqi_pop_change_plot <- ggMarginal(eqi_pop_change_plot_scatter, type = "boxplot", groupColour = TRUE)
eqi_pop_change_plot
eqi_pop_rucc_scatter <- ggplot(data = eqi_pop, aes(x = EQI, y = pop_change_pct)) +
geom_point(aes(color = cat_rucc), size = .75) +
theme(legend.position = "bottom") +
labs(title = "County percent population change vs. EQI, 2006-2010", x = "EQI", y = " % population change", color = "RUCC:")
eqi_pop_rucc_plot <- ggMarginal(eqi_pop_rucc_scatter, type = "boxplot", groupColour = TRUE)
eqi_pop_rucc_plot
eqi_pop_rucc_scatter2 <- ggplot(data = eqi_pop, aes(x = EQI, y = pop_change_pct)) +
geom_point(aes(color = rucc_text), size = .75) +
theme(legend.position = "bottom") +
labs(title = "County percent population change vs. EQI, 2006-2010", x = "EQI", y = " % population change", color = "RUCC:")
eqi_pop_rucc_plot2 <- ggMarginal(eqi_pop_rucc_scatter2, type = "boxplot", groupColour = TRUE)
eqi_pop_rucc_plot2
eqi_pop_rucc_scatter2_no_outlier <- ggplot(data = eqi_pop_no_outlier, aes(x = EQI, y = pop_change_pct)) +
geom_point(aes(color = rucc_text), size = .75) +
theme(legend.position = "bottom") +
labs(title = "County percent population change vs. EQI, 2006-2010", x = "EQI", y = " % population change", color = "RUCC:")
eqi_pop_rucc_plot2_no_outlier <- ggMarginal(eqi_pop_rucc_scatter2_no_outlier, type = "boxplot", groupColour = TRUE)
eqi_pop_rucc_plot2_no_outlier
county_pop_rucc_histogram <- ggplot(data = eqi_pop, aes(x = pop_change_pct)) +
geom_histogram(aes(fill = cat_rucc), position = "dodge", bins = 50)
county_pop_rucc_histogram
county_pop_rucc_histogram2 <- ggplot(data = eqi_pop, aes(x = pop_change_pct)) +
geom_histogram(aes(fill = rucc_text), position = "dodge", bins = 50)
county_pop_rucc_histogram2
pop_boxplot <- ggplot(data = eqi_pop, aes(x = cat_rucc, y = pop_change_pct)) +
geom_boxplot(aes(fill = cat_rucc), show.legend = FALSE) +
coord_flip()
pop_boxplot
pop_boxplot2 <- ggplot(data = eqi_pop, aes(x = pop_change_pct)) +
geom_boxplot() +
labs(title = "U.S. county level population change, 2006-2010", x = "percent population change") +
theme(axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_blank())
pop_boxplot2
eqi_pop_change_plot <- ggplot(data = eqi_pop, aes(x = EQI, y = pop_change_pct)) +
geom_point()
air_eqi_pop_change_plot <- ggplot(data = eqi_pop, aes(x = air_EQI, y = pop_change_pct)) +
geom_point()
water_eqi_pop_change_plot <- ggplot(data = eqi_pop, aes(x = water_EQI, y = pop_change_pct)) +
geom_point()
land_eqi_pop_change_plot <- ggplot(data = eqi_pop, aes(x = land_EQI, y = pop_change_pct)) +
geom_point()
sociod_eqi_pop_change_plot <- ggplot(data = eqi_pop, aes(x = sociod_EQI, y = pop_change_pct)) +
geom_point()
built_eqi_pop_change_plot <- ggplot(data = eqi_pop, aes(x = built_EQI, y = pop_change_pct)) +
geom_point()
p1 <- land_eqi_pop_change_plot / built_eqi_pop_change_plot
p2 <- (air_eqi_pop_change_plot | water_eqi_pop_change_plot | land_eqi_pop_change_plot) + plot_layout(widths = 1)
row1 <- (eqi_pop_change_plot | p1) + plot_layout(widths = c(2, 1))
all_eqi_plot <- (row1 / p2) + plot_layout(heights = c(3, 1))
all_eqi_plot
Patchwork with no outlier
eqi_pop_change_plot_no_outlier <- ggplot(data = eqi_pop_no_outlier, aes(x = EQI, y = pop_change_pct)) +
geom_point() +
labs(y = "% pop change") +
theme(axis.text.y = element_text(size = 6),
axis.text.x = element_text(size = 6))
air_eqi_pop_change_plot_no_outlier <- ggplot(data = eqi_pop_no_outlier, aes(x = air_EQI, y = pop_change_pct)) +
geom_point() +
labs(y = "% pop change") +
theme(axis.text.y = element_text(size = 6),
axis.text.x = element_text(size = 6))
water_eqi_pop_change_plot_no_outlier <- ggplot(data = eqi_pop_no_outlier, aes(x = water_EQI, y = pop_change_pct)) +
geom_point() +
labs(y = "% pop change") +
theme(axis.text.y = element_text(size = 6),
axis.text.x = element_text(size = 6))
land_eqi_pop_change_plot_no_outlier <- ggplot(data = eqi_pop_no_outlier, aes(x = land_EQI, y = pop_change_pct)) +
geom_point() +
labs(y = "% pop change") +
theme(axis.text.y = element_text(size = 6),
axis.text.x = element_text(size = 6))
sociod_eqi_pop_change_plot_no_outlier <- ggplot(data = eqi_pop_no_outlier, aes(x = sociodemographic_EQI, y = pop_change_pct)) +
geom_point() +
labs(y = "% pop change") +
theme(axis.text.y = element_text(size = 6),
axis.text.x = element_text(size = 6))
built_eqi_pop_change_plot_no_outlier <- ggplot(data = eqi_pop_no_outlier, aes(x = built_EQI, y = pop_change_pct)) +
geom_point() +
labs(y = "% pop change") +
theme(axis.text.y = element_text(size = 6),
axis.text.x = element_text(size = 6))
p1_no_outlier <- land_eqi_pop_change_plot_no_outlier / built_eqi_pop_change_plot_no_outlier
p2_no_outlier <- (air_eqi_pop_change_plot_no_outlier | water_eqi_pop_change_plot_no_outlier | sociod_eqi_pop_change_plot_no_outlier) + plot_layout(widths = 1)
row1_no_outlier <- (eqi_pop_change_plot_no_outlier | p1_no_outlier) + plot_layout(widths = c(2, 1))
all_eqi_plot_no_outlier <- (row1_no_outlier / p2_no_outlier) + plot_layout(heights = c(3, 1))
all_eqi_plot_no_outlier
histogram <- ggplot(data = eqi_pop, aes(x = EQI)) +
geom_histogram(bins = 100)
histogram
histogram_pop_change <- ggplot(data = eqi_pop, aes(x = EQI)) +
geom_histogram(aes(fill = pop_change_text), position = "dodge", bins = 50) +
labs(title = "Historgram of EQI based on county population change, 2006-2010", x = "EQI", fill = "Population change")
histogram_pop_change
histogram_eqi_rucc <- ggplot(data = eqi_pop, aes(x = EQI)) +
geom_histogram(aes(fill = rucc_text), position = "dodge", bins = 50) +
labs(title = "Historgram of EQI based on RUCC", x = "EQI", fill = "RUCC")
histogram_eqi_rucc
bin_scatter <-ggplot(data=eqi_pop, aes(x=EQI, y = pop_change_pct)) +
geom_bin2d(bins=50) + theme_bw() + geom_hline(yintercept=0, color="red")
bin_scatter
null_hypothesis: There is no difference in mean EQI for counties with positive and negative population change. \[H_{0}: \mu_{posPopChange} - \mu_{negPopChange} = 0\] alternative hypothesis: There is a difference in EQI for counties with positive and negative population change. \[H_{A}: \mu_{posPopChange} - \mu_{negPopChange} \neq 0\]
The standard error for a difference in means is defined as: \(SE = \sqrt{\frac{s_1^2}{n_1} + \frac{s^2_2}{n_2}}\) and the test-statistic for a hypothesis test is \(z = \frac{\text{point estimate - null}}{SE}\)
eqi_summary_table <- eqi_pop %>%
group_by(pop_change_text) %>%
summarise(min_eqi = round(min(EQI),2),
max_eqi = round(max(EQI),2),
mean_eqi = round(mean(EQI),2),
sd_eqi = round(sd(EQI), 2),
var_eqi = round(var(EQI), 2),
count = n()) %>%
bind_rows(., eqi_pop %>%
summarise(min_eqi = round(min(EQI),2),
max_eqi = round(max(EQI),2),
mean_eqi = round(mean(EQI),2),
sd_eqi = round(sd(EQI), 2),
var_eqi = round(var(EQI), 2),
count = n()) %>%
mutate(pop_change_text = "all counties")
) %>%
kable(col.names = c("Population Change", "Minimum", "Maximum", "Mean", "Standard Deviation", "Variance", "Number of Counties"), caption = "EQI summary statistics for counties based on population change between 2006 and 2010", color = 'black') %>%
kable_paper(full_width = FALSE) %>%
column_spec(1, bold = T) %>%
row_spec(0, bold = T, color = 'black') %>%
row_spec(3, bold = T, background = '#e5e5e5')
eqi_summary_table
| Population Change | Minimum | Maximum | Mean | Standard Deviation | Variance | Number of Counties |
|---|---|---|---|---|---|---|
| negative | -5.88 | 2.59 | -0.24 | 0.93 | 0.87 | 1088 |
| positive | -5.05 | 2.85 | 0.13 | 1.01 | 1.02 | 2052 |
| all counties | -5.88 | 2.85 | 0.00 | 1.00 | 1.00 | 3140 |
# mean values
mu_eqi_pos_pop_change <- eqi_pop %>%
filter(pop_change_pct > 0) %>%
summarise(mean_eqi = mean(EQI))
mu_eqi_neg_pop_change <- eqi_pop %>%
filter(pop_change_pct <= 0) %>%
summarise(mean_eqi = mean(EQI))
# standard deviations
sd_eqi_pos_pop_change <- eqi_pop %>%
filter(pop_change_pct > 0) %>%
summarise(sd_eqi = sd(EQI))
sd_eqi_neg_pop_change <- eqi_pop %>%
filter(pop_change_pct <= 0) %>%
summarise(sd_eqi = sd(EQI))
# count of observations
n_pos_pop_change <- eqi_pop %>%
filter(pop_change_pct > 0) %>%
summarise(count = n())
n_neg_pop_change <- eqi_pop %>%
filter(pop_change_pct <= 0) %>%
summarise(count = n())
print(sd_eqi_neg_pop_change)
## # A tibble: 1 × 1
## sd_eqi
## <dbl>
## 1 0.933
# calculate point estimate
point_est <- as.numeric(mu_eqi_pos_pop_change - mu_eqi_neg_pop_change)
print(point_est)
## [1] 0.375554
To calculate a standard error for a difference in means:
n = number of observations for each group s = standard deviation for each group \[SE = \sqrt{\frac{s_1^2}{n_1} + \frac{s^2_2}{n_2}}\]
# calculate standard error
SE <- as.numeric(sqrt( (sd_eqi_pos_pop_change^2 / n_pos_pop_change) + (sd_eqi_neg_pop_change^2 / n_neg_pop_change) ))
The definition of the z-score for hypothesis testing:
\[z_{score}=\frac{\text { point estimate }-\text { null value }}{S E}\]
z_score <- (point_est - 0) / SE
Calculated p-value: the probability of getting a point estimate at least as extreme as calculates above if the null hypothesis were true: \[p \text { - value }=\operatorname{Pr}(Z<-|z| \text { or } Z>|z|)=2 * \operatorname{Pr}(Z>|z|)\]
# Make use of the function `pnorm()` to access the normal distribution. Note: `pnorm()` gives you the probability mass below a certain cutoff in a probability distribution with a mean and standard deviation you can control. Use `lower.tail=FALSE` to get the mass above a given cutoff.
p_val = 2 * pnorm(point_est, mean = 0, sd = SE, lower.tail=FALSE)
print(p_val)
## [1] 1.80126e-25
Since \(p-value = 0.018 < 0.05\) we reject the null that there is no difference in the EQI of counties with positive population change versus negative population change. There is a statistically significant difference (at the 5% significance level) in EQI across the two population change groups.
Build a model for EQI vs population change and a model with coefficients for each EQI category to identify the category most correlated with population change
county_shp <- st_read("data/gz_2010_us_050_00_20m/gz_2010_us_050_00_20m.shp")
## Reading layer `gz_2010_us_050_00_20m' from data source
## `/Users/marierivers/Documents/UCSB_Environmental_Data_Science/EDS_222_Statistics_for_Environmental_Data_Science/EDS_222_Final_Project/data/gz_2010_us_050_00_20m/gz_2010_us_050_00_20m.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 3221 features and 6 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -179.1473 ymin: 17.88481 xmax: 179.7785 ymax: 71.35256
## Geodetic CRS: NAD83
project filter only 50 states + DC try to find a shapefile that makes alaska and hawaii look good
# https://urbaninstitute.github.io/urbnmapr/
# counties_sf <- get_urbn_map("counties", sf = TRUE)
#
# counties_sf %>%
# ggplot(aes()) +
# geom_sf(fill = "grey", color = "#ffffff")
county_map_df <- county_laea %>%
rename(stfips = GEOID) %>%
right_join(eqi_pop)
## Joining, by = "stfips"
## old-style crs object detected; please recreate object with a recent sf::st_crs()
No space in blog post for maps!
library(RColorBrewer)
pal <- brewer.pal(7, "OrRd")
plot(county_map_df["EQI"],
main = "Environmental Quality Index by County, 2006-2010",
breaks = "quantile",
nbreaks = 7,
pal = pal)
# county_map_df2 %>%
# ggplot(aes()) +
# geom_sf(aes(fill = pop_change_pct), color = "black", size = .5)
Linear regression models were created for each domain specific EQI to determine if a particular domain was more strongly correlated with population change.
eqi_model <- lm(pop_change_pct ~ EQI, data = eqi_pop)
summary(eqi_model)
##
## Call:
## lm(formula = pop_change_pct ~ EQI, data = eqi_pop)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.528 -2.839 -0.710 2.034 120.039
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.90890 0.08731 21.86 <2e-16 ***
## EQI 0.99440 0.08739 11.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.892 on 3138 degrees of freedom
## Multiple R-squared: 0.03962, Adjusted R-squared: 0.03932
## F-statistic: 129.5 on 1 and 3138 DF, p-value: < 2.2e-16
eqi_rucc_model <- lm(pop_change_pct ~ EQI + rucc_text, data = eqi_pop)
summary(eqi_rucc_model)
##
## Call:
## lm(formula = pop_change_pct ~ EQI + rucc_text, data = eqi_pop)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.302 -2.683 -0.658 1.916 118.553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.69269 0.12295 5.634 1.92e-08 ***
## EQI 0.30978 0.09859 3.142 0.00169 **
## rucc_textmetro or urban 2.70580 0.19800 13.665 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.754 on 3137 degrees of freedom
## Multiple R-squared: 0.09358, Adjusted R-squared: 0.093
## F-statistic: 161.9 on 2 and 3137 DF, p-value: < 2.2e-16
air_eqi_model <- lm(pop_change_pct ~ air_EQI, data = eqi_pop)
water_eqi_model <- lm(pop_change_pct ~ water_EQI, data = eqi_pop)
land_eqi_model <- lm(pop_change_pct ~ land_EQI, data = eqi_pop)
built_eqi_model <- lm(pop_change_pct ~ built_EQI, data = eqi_pop)
sociodemographic_eqi_model <- lm(pop_change_pct ~ sociodemographic_EQI, data = eqi_pop)
plot_summs(eqi_model, air_eqi_model, water_eqi_model, land_eqi_model, built_eqi_model, sociodemographic_eqi_model, inner_ci_level = 0.9, model.names = c("EQI", "air EQI", "water EQI", "land EQI", "built EQI", "sociodem EQI"))
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
export_summs(eqi_model, air_eqi_model, water_eqi_model, land_eqi_model, built_eqi_model, sociodemographic_eqi_model,
digits = 3, error_format = "[{conf.low}, {conf.high}]",
model.names = c("EQI", "air EQI", "water EQI", "land EQI", "built EQI", "sociodem EQI"))
| EQI | air EQI | water EQI | land EQI | built EQI | sociodem EQI | |
|---|---|---|---|---|---|---|
| (Intercept) | 1.909 *** | 1.910 *** | 1.909 *** | 1.910 *** | 1.908 *** | 1.910 *** |
| [1.738, 2.080] | [1.739, 2.081] | [1.735, 2.083] | [1.735, 2.084] | [1.735, 2.082] | [1.739, 2.080] | |
| EQI | 0.994 *** | |||||
| [0.823, 1.166] | ||||||
| air_EQI | 1.038 *** | |||||
| [0.867, 1.209] | ||||||
| water_EQI | 0.432 *** | |||||
| [0.258, 0.606] | ||||||
| land_EQI | -0.307 *** | |||||
| [-0.481, -0.132] | ||||||
| built_EQI | 0.641 *** | |||||
| [0.467, 0.815] | ||||||
| sociodemographic_EQI | 1.055 *** | |||||
| [0.885, 1.226] | ||||||
| N | 3140 | 3140 | 3140 | 3140 | 3140 | 3140 |
| R2 | 0.040 | 0.043 | 0.007 | 0.004 | 0.016 | 0.045 |
| *** p < 0.001; ** p < 0.01; * p < 0.05. | ||||||